library(tidyverse)
library(janitor)
library(knitr)
library(here)
# Optional packages
has_naniar <- suppressWarnings(require(naniar, quietly = TRUE))
has_skimr <- suppressWarnings(require(skimr, quietly = TRUE))
has_gt <- suppressWarnings(require(gt, quietly = TRUE))
# File paths using here() for robust path resolution
primary_file <- here("data", "grf_shooting_clean.csv")
fallback_file <- here("data", "GRF Shooting SM[40].csv")
# Load data with fallback
if (file.exists(primary_file)) {
df <- read_csv(primary_file, show_col_types = FALSE)
cat("Loaded primary file:", primary_file, "\n")
} else if (file.exists(fallback_file)) {
df <- read_csv(fallback_file, show_col_types = FALSE) %>%
janitor::clean_names()
cat("Loaded fallback file:", fallback_file, "\n")
cat("Applied janitor::clean_names()\n")
} else {
stop("ERROR: Neither data file found!\n",
"Expected: ", primary_file, " or ", fallback_file)
}
## Loaded primary file: /Users/samuelmontalvo/Documents/GitHub/basketball_shooting_project/data/grf_shooting_clean.csv
cat("\nDimensions:", nrow(df), "rows x", ncol(df), "columns\n")
##
## Dimensions: 460 rows x 18 columns
cat("Column names:\n")
## Column names:
cat(paste(names(df), collapse = "\n"))
## player_id
## shot_number
## shot_type
## made
## position
## height
## body_mass
## peak_braking_force
## peak_braking_power
## peak_propulsive_force
## peak_propulsive_power
## left_leg_peak_propulsive_force
## right_leg_peak_propulsive_force
## jump_height
## braking_duration
## propulsive_duration
## displacement_depth
## force_at_min_displacement
# Helper function to find columns by keyword
find_col <- function(cols, patterns) {
for (p in patterns) {
matches <- cols[str_detect(tolower(cols), tolower(p))]
if (length(matches) > 0) return(matches[1])
}
return(NA_character_)
}
# Find player_id
player_id_col <- find_col(names(df), c("^player_id$", "^player$", "subject", "athlete", "^pid$", "^id$"))
if (is.na(player_id_col)) {
stop("ERROR: Could not identify player_id column.\n",
"Expected column containing: player, pid, athlete, subject, or id")
}
# Rename to player_id if needed
if (player_id_col != "player_id") {
df <- df %>% rename(player_id = !!sym(player_id_col))
}
df <- df %>% mutate(player_id = as.character(player_id))
# Find shot_type
shot_type_col <- find_col(names(df), c("^shot_type$", "shot.*type", "type.*2.*3"))
has_shot_type <- !is.na(shot_type_col)
if (has_shot_type && shot_type_col != "shot_type") {
df <- df %>% rename(shot_type = !!sym(shot_type_col))
}
if (has_shot_type) {
df <- df %>%
mutate(shot_type = case_when(
shot_type %in% c(2, "2", "2PT", "2pt") ~ "2PT",
shot_type %in% c(3, "3", "3PT", "3pt") ~ "3PT",
TRUE ~ as.character(shot_type)
))
}
# Find made/result
made_col <- find_col(names(df), c("^made$", "miss.*made", "result", "outcome", "make"))
has_made <- !is.na(made_col)
if (has_made && made_col != "made") {
df <- df %>% rename(made = !!sym(made_col))
}
if (has_made) {
df <- df %>% mutate(made = as.integer(made))
}
# Report findings
cat("Column Mapping:\n")
## Column Mapping:
cat("─────────────────────────────────────\n")
## ─────────────────────────────────────
cat("player_id:", player_id_col, "→ player_id\n")
## player_id: player_id → player_id
cat("shot_type:", ifelse(has_shot_type, shot_type_col, "NOT FOUND"), "\n")
## shot_type: shot_type
cat("made: ", ifelse(has_made, made_col, "NOT FOUND"), "\n")
## made: made
missingness_by_col <- df %>%
summarise(across(everything(), ~ sum(is.na(.x)))) %>%
pivot_longer(everything(), names_to = "column", values_to = "n_missing") %>%
mutate(
n_total = nrow(df),
pct_missing = round(100 * n_missing / n_total, 2)
) %>%
arrange(desc(pct_missing))
kable(missingness_by_col, caption = "Missingness by Column")
| column | n_missing | n_total | pct_missing |
|---|---|---|---|
| player_id | 0 | 460 | 0 |
| shot_number | 0 | 460 | 0 |
| shot_type | 0 | 460 | 0 |
| made | 0 | 460 | 0 |
| position | 0 | 460 | 0 |
| height | 0 | 460 | 0 |
| body_mass | 0 | 460 | 0 |
| peak_braking_force | 0 | 460 | 0 |
| peak_braking_power | 0 | 460 | 0 |
| peak_propulsive_force | 0 | 460 | 0 |
| peak_propulsive_power | 0 | 460 | 0 |
| left_leg_peak_propulsive_force | 0 | 460 | 0 |
| right_leg_peak_propulsive_force | 0 | 460 | 0 |
| jump_height | 0 | 460 | 0 |
| braking_duration | 0 | 460 | 0 |
| propulsive_duration | 0 | 460 | 0 |
| displacement_depth | 0 | 460 | 0 |
| force_at_min_displacement | 0 | 460 | 0 |
p_missing <- missingness_by_col %>%
filter(pct_missing > 0 | row_number() <= 10) %>%
ggplot(aes(x = reorder(column, pct_missing), y = pct_missing)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(
title = "Missing Data by Column",
x = NULL,
y = "% Missing"
) +
theme_minimal()
print(p_missing)
n_duplicates <- sum(duplicated(df))
cat("Number of fully duplicated rows:", n_duplicates, "\n")
## Number of fully duplicated rows: 0
if ("shot_number" %in% names(df) && has_shot_type) {
dup_trials <- df %>%
group_by(player_id, shot_type, shot_number) %>%
filter(n() > 1) %>%
nrow()
cat("Duplicate (player_id, shot_type, shot_number) combinations:", dup_trials, "\n")
}
## Duplicate (player_id, shot_type, shot_number) combinations: 0
trials_per_player <- df %>%
count(player_id, name = "n_trials") %>%
arrange(player_id)
kable(trials_per_player, caption = "Trials per Player")
| player_id | n_trials |
|---|---|
| 1 | 20 |
| 10 | 20 |
| 11 | 20 |
| 12 | 20 |
| 13 | 20 |
| 14 | 20 |
| 15 | 20 |
| 16 | 20 |
| 17 | 20 |
| 18 | 20 |
| 19 | 20 |
| 2 | 20 |
| 20 | 20 |
| 21 | 20 |
| 22 | 20 |
| 23 | 20 |
| 3 | 20 |
| 4 | 20 |
| 5 | 20 |
| 6 | 20 |
| 7 | 20 |
| 8 | 20 |
| 9 | 20 |
trials_by_type <- df %>%
count(player_id, shot_type) %>%
pivot_wider(names_from = shot_type, values_from = n, values_fill = 0) %>%
mutate(
total = `2PT` + `3PT`,
balanced = ifelse(`2PT` == 10 & `3PT` == 10, "✓", "✗")
)
kable(trials_by_type, caption = "Trials by Shot Type (Expected: 10/10)")
| player_id | 2PT | 3PT | total | balanced |
|---|---|---|---|---|
| 1 | 10 | 10 | 20 | ✓ |
| 10 | 10 | 10 | 20 | ✓ |
| 11 | 10 | 10 | 20 | ✓ |
| 12 | 10 | 10 | 20 | ✓ |
| 13 | 10 | 10 | 20 | ✓ |
| 14 | 10 | 10 | 20 | ✓ |
| 15 | 10 | 10 | 20 | ✓ |
| 16 | 10 | 10 | 20 | ✓ |
| 17 | 10 | 10 | 20 | ✓ |
| 18 | 10 | 10 | 20 | ✓ |
| 19 | 10 | 10 | 20 | ✓ |
| 2 | 10 | 10 | 20 | ✓ |
| 20 | 10 | 10 | 20 | ✓ |
| 21 | 10 | 10 | 20 | ✓ |
| 22 | 10 | 10 | 20 | ✓ |
| 23 | 10 | 10 | 20 | ✓ |
| 3 | 10 | 10 | 20 | ✓ |
| 4 | 10 | 10 | 20 | ✓ |
| 5 | 10 | 10 | 20 | ✓ |
| 6 | 10 | 10 | 20 | ✓ |
| 7 | 10 | 10 | 20 | ✓ |
| 8 | 10 | 10 | 20 | ✓ |
| 9 | 10 | 10 | 20 | ✓ |
complete_players <- sum(trials_by_type$balanced == "✓")
cat("\nPlayers with complete data (10 2PT + 10 3PT):", complete_players, "of", nrow(trials_by_type), "\n")
##
## Players with complete data (10 2PT + 10 3PT): 23 of 23
# Identify numeric columns
exclude_cols <- c("player_id", "shot_type", "made", "position")
numeric_cols <- df %>%
select(-any_of(exclude_cols)) %>%
select(where(is.numeric)) %>%
names()
cat("Identified", length(numeric_cols), "numeric columns\n")
## Identified 14 numeric columns
# Create summary table
numeric_summary <- df %>%
select(all_of(numeric_cols)) %>%
summarise(across(everything(), list(
mean = ~ mean(.x, na.rm = TRUE),
sd = ~ sd(.x, na.rm = TRUE),
median = ~ median(.x, na.rm = TRUE),
iqr = ~ IQR(.x, na.rm = TRUE),
min = ~ min(.x, na.rm = TRUE),
max = ~ max(.x, na.rm = TRUE),
p1 = ~ quantile(.x, 0.01, na.rm = TRUE),
p99 = ~ quantile(.x, 0.99, na.rm = TRUE),
pct_na = ~ round(100 * sum(is.na(.x)) / length(.x), 2)
))) %>%
pivot_longer(
everything(),
names_to = c("variable", "stat"),
names_pattern = "^(.*)_(mean|sd|median|iqr|min|max|p1|p99|pct_na)$"
) %>%
pivot_wider(names_from = stat, values_from = value) %>%
mutate(across(where(is.numeric), ~ round(.x, 2))) %>%
arrange(variable)
kable(numeric_summary, caption = "Numeric Variable Summary Statistics")
| variable | mean | sd | median | iqr | min | max | p1 | p99 | pct_na |
|---|---|---|---|---|---|---|---|---|---|
| body_mass | 91.29 | 12.77 | 90.68 | 12.07 | 68.88 | 123.97 | 68.88 | 123.97 | 0 |
| braking_duration | 0.42 | 0.47 | 0.29 | 0.12 | 0.08 | 2.81 | 0.11 | 2.69 | 0 |
| displacement_depth | 21.95 | 19.42 | 17.70 | 8.05 | 0.00 | 184.80 | 0.79 | 100.73 | 0 |
| force_at_min_displacement | 1500.07 | 381.29 | 1518.00 | 396.50 | 7.90 | 2643.00 | 11.20 | 2333.87 | 0 |
| height | 195.35 | 6.49 | 195.00 | 10.00 | 180.00 | 206.00 | 180.00 | 206.00 | 0 |
| jump_height | 9.69 | 5.05 | 8.90 | 7.40 | 1.30 | 27.10 | 1.60 | 23.08 | 0 |
| left_leg_peak_propulsive_force | 1048.32 | 201.98 | 1018.00 | 302.00 | 609.00 | 1567.00 | 653.95 | 1513.97 | 0 |
| peak_braking_force | 1532.33 | 308.83 | 1520.50 | 378.75 | 827.00 | 2643.00 | 979.16 | 2333.87 | 0 |
| peak_braking_power | 601.14 | 292.79 | 594.50 | 379.25 | -0.14 | 2131.00 | 59.36 | 1540.15 | 0 |
| peak_propulsive_force | 1959.21 | 343.43 | 1911.50 | 529.75 | 1099.00 | 2914.00 | 1194.96 | 2827.27 | 0 |
| peak_propulsive_power | 2753.89 | 734.59 | 2711.00 | 975.25 | 544.00 | 7311.00 | 1326.22 | 4580.10 | 0 |
| propulsive_duration | 0.31 | 0.25 | 0.29 | 0.08 | 0.14 | 2.71 | 0.18 | 1.47 | 0 |
| right_leg_peak_propulsive_force | 940.41 | 167.93 | 926.50 | 207.25 | 568.00 | 1513.00 | 603.85 | 1421.17 | 0 |
| shot_number | 5.50 | 2.88 | 5.50 | 5.00 | 1.00 | 10.00 | 1.00 | 10.00 | 0 |
global_outliers <- tibble(
column = character(),
n_outliers = integer(),
pct_outliers = numeric()
)
for (col in numeric_cols) {
x <- df[[col]]
q1 <- quantile(x, 0.25, na.rm = TRUE)
q3 <- quantile(x, 0.75, na.rm = TRUE)
iqr <- q3 - q1
lower <- q1 - 1.5 * iqr
upper <- q3 + 1.5 * iqr
n_out <- sum(x < lower | x > upper, na.rm = TRUE)
pct_out <- round(100 * n_out / sum(!is.na(x)), 2)
global_outliers <- global_outliers %>%
add_row(column = col, n_outliers = n_out, pct_outliers = pct_out)
}
within_player_outliers <- tibble(
column = character(),
n_outliers = integer()
)
# Players with >= 5 trials
valid_players <- df %>%
count(player_id) %>%
filter(n >= 5) %>%
pull(player_id)
df_valid <- df %>% filter(player_id %in% valid_players)
for (col in numeric_cols) {
z_scores <- df_valid %>%
group_by(player_id) %>%
mutate(
col_mean = mean(.data[[col]], na.rm = TRUE),
col_sd = sd(.data[[col]], na.rm = TRUE),
z = ifelse(col_sd > 0, (.data[[col]] - col_mean) / col_sd, 0)
) %>%
ungroup() %>%
pull(z)
n_out <- sum(abs(z_scores) > 3, na.rm = TRUE)
within_player_outliers <- within_player_outliers %>%
add_row(column = col, n_outliers = n_out)
}
outlier_summary <- global_outliers %>%
rename(global_outliers = n_outliers, global_pct = pct_outliers) %>%
left_join(
within_player_outliers %>% rename(within_player_outliers = n_outliers),
by = "column"
) %>%
arrange(desc(global_outliers + within_player_outliers))
kable(outlier_summary, caption = "Outlier Summary by Column")
| column | global_outliers | global_pct | within_player_outliers |
|---|---|---|---|
| displacement_depth | 59 | 12.83 | 8 |
| braking_duration | 54 | 11.74 | 9 |
| body_mass | 40 | 8.70 | 0 |
| force_at_min_displacement | 18 | 3.91 | 3 |
| propulsive_duration | 11 | 2.39 | 6 |
| peak_braking_force | 10 | 2.17 | 4 |
| peak_braking_power | 7 | 1.52 | 4 |
| right_leg_peak_propulsive_force | 10 | 2.17 | 0 |
| peak_propulsive_power | 4 | 0.87 | 3 |
| jump_height | 3 | 0.65 | 0 |
| shot_number | 0 | 0.00 | 0 |
| height | 0 | 0.00 | 0 |
| peak_propulsive_force | 0 | 0.00 | 0 |
| left_leg_peak_propulsive_force | 0 | 0.00 | 0 |
key_keywords <- c("force", "power", "jump", "brak", "propul", "duration", "depth")
# Get keyword matches
keyword_matches <- numeric_cols[str_detect(tolower(numeric_cols),
paste(key_keywords, collapse = "|"))]
# Get top 10 by variance
var_ranking <- df %>%
select(all_of(numeric_cols)) %>%
summarise(across(everything(), ~ var(.x, na.rm = TRUE))) %>%
pivot_longer(everything(), names_to = "column", values_to = "variance") %>%
arrange(desc(variance))
top_by_variance <- var_ranking %>% slice_head(n = 10) %>% pull(column)
# Combine
key_vars <- unique(c(keyword_matches, top_by_variance))
key_vars <- key_vars[1:min(12, length(key_vars))]
cat("Key variables selected for visualization:\n")
## Key variables selected for visualization:
cat(paste(key_vars, collapse = "\n"))
## peak_braking_force
## peak_braking_power
## peak_propulsive_force
## peak_propulsive_power
## left_leg_peak_propulsive_force
## right_leg_peak_propulsive_force
## jump_height
## braking_duration
## propulsive_duration
## displacement_depth
## force_at_min_displacement
## body_mass
for (var in key_vars) {
p <- df %>%
ggplot(aes(x = .data[[var]])) +
geom_histogram(aes(y = after_stat(density)), bins = 30,
fill = "steelblue", alpha = 0.7) +
geom_density(color = "darkblue", linewidth = 1) +
labs(title = paste("Distribution:", var), x = var, y = "Density") +
theme_minimal()
print(p)
}
for (var in key_vars) {
cat("\n\n### ", var, "\n\n")
p <- df %>%
ggplot(aes(x = shot_type, y = .data[[var]], fill = shot_type)) +
geom_violin(alpha = 0.5) +
geom_boxplot(width = 0.2, alpha = 0.8, outlier.shape = NA) +
geom_jitter(width = 0.1, alpha = 0.3, size = 1) +
scale_fill_manual(values = c("2PT" = "#1f77b4", "3PT" = "#ff7f0e")) +
labs(title = paste(var, "by Shot Type"), x = "Shot Type", y = var) +
theme_minimal() +
theme(legend.position = "none")
print(p)
}
jump_col <- find_col(numeric_cols, c("jump_height", "jump", "height"))
has_jump <- !is.na(jump_col)
if (has_jump) {
cat("Jump height variable identified:", jump_col, "\n")
} else {
cat("No jump height variable found\n")
}
## Jump height variable identified: jump_height
player_means <- df %>%
group_by(player_id, shot_type) %>%
summarise(mean_value = mean(.data[[jump_col]], na.rm = TRUE), .groups = "drop")
player_wide <- player_means %>%
pivot_wider(names_from = shot_type, values_from = mean_value)
p_paired <- player_wide %>%
ggplot(aes(x = `2PT`, y = `3PT`)) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray50") +
geom_point(size = 4, alpha = 0.7, color = "steelblue") +
labs(
title = paste("Player Mean", jump_col, ": 2PT vs 3PT"),
subtitle = "Points above dashed line = higher 3PT than 2PT",
x = "Mean 2PT Jump Height",
y = "Mean 3PT Jump Height"
) +
theme_minimal() +
coord_equal()
print(p_paired)
p_spaghetti <- player_means %>%
ggplot(aes(x = shot_type, y = mean_value, group = player_id, color = player_id)) +
geom_line(alpha = 0.7, linewidth = 1) +
geom_point(size = 3) +
labs(
title = paste("Player-Level Mean", jump_col, "by Shot Type"),
x = "Shot Type",
y = paste("Mean", jump_col),
color = "Player"
) +
theme_minimal()
print(p_spaghetti)
cor_matrix <- df %>%
select(all_of(numeric_cols)) %>%
cor(use = "complete.obs")
# Convert to long format
cor_long <- cor_matrix %>%
as.data.frame() %>%
rownames_to_column("var1") %>%
pivot_longer(-var1, names_to = "var2", values_to = "correlation")
p_cor <- cor_long %>%
ggplot(aes(x = var1, y = var2, fill = correlation)) +
geom_tile() +
scale_fill_gradient2(
low = "#2166ac", mid = "white", high = "#b2182b",
midpoint = 0, limits = c(-1, 1)
) +
labs(title = "Correlation Matrix", x = "", y = "") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
axis.text.y = element_text(size = 7)
)
print(p_cor)
strong_cors <- cor_long %>%
filter(var1 < var2) %>%
filter(abs(correlation) > 0.7) %>%
arrange(desc(abs(correlation))) %>%
mutate(correlation = round(correlation, 3))
kable(strong_cors %>% head(20), caption = "Strongest Correlations (|r| > 0.7)")
| var1 | var2 | correlation |
|---|---|---|
| left_leg_peak_propulsive_force | peak_propulsive_force | 0.923 |
| force_at_min_displacement | peak_braking_force | 0.877 |
| peak_propulsive_force | right_leg_peak_propulsive_force | 0.876 |
| peak_braking_force | peak_propulsive_force | 0.711 |
overall_make_rate <- mean(df$made, na.rm = TRUE)
cat("Overall make rate:", round(overall_make_rate * 100, 1), "%\n")
## Overall make rate: 64.1 %
make_by_type <- df %>%
group_by(shot_type) %>%
summarise(
n = n(),
makes = sum(made, na.rm = TRUE),
make_rate = round(mean(made, na.rm = TRUE) * 100, 1),
.groups = "drop"
)
kable(make_by_type, caption = "Make Rate by Shot Type")
| shot_type | n | makes | make_rate |
|---|---|---|---|
| 2PT | 230 | 167 | 72.6 |
| 3PT | 230 | 128 | 55.7 |
p_make_type <- make_by_type %>%
ggplot(aes(x = shot_type, y = make_rate, fill = shot_type)) +
geom_col(alpha = 0.8) +
geom_hline(yintercept = overall_make_rate * 100, linetype = "dashed", color = "red") +
scale_fill_manual(values = c("2PT" = "#1f77b4", "3PT" = "#ff7f0e")) +
labs(title = "Make Rate by Shot Type",
subtitle = paste("Red line = overall (", round(overall_make_rate * 100, 1), "%)"),
x = "Shot Type", y = "Make Rate (%)") +
theme_minimal() +
theme(legend.position = "none")
print(p_make_type)
make_by_player <- df %>%
group_by(player_id) %>%
summarise(
n = n(),
makes = sum(made, na.rm = TRUE),
make_rate = round(mean(made, na.rm = TRUE) * 100, 1),
.groups = "drop"
) %>%
arrange(desc(make_rate))
kable(make_by_player, caption = "Make Rate by Player (Ranked)")
| player_id | n | makes | make_rate |
|---|---|---|---|
| 16 | 20 | 16 | 80 |
| 17 | 20 | 16 | 80 |
| 9 | 20 | 16 | 80 |
| 13 | 20 | 15 | 75 |
| 21 | 20 | 15 | 75 |
| 3 | 20 | 15 | 75 |
| 2 | 20 | 14 | 70 |
| 22 | 20 | 14 | 70 |
| 1 | 20 | 13 | 65 |
| 15 | 20 | 13 | 65 |
| 19 | 20 | 13 | 65 |
| 6 | 20 | 13 | 65 |
| 11 | 20 | 12 | 60 |
| 12 | 20 | 12 | 60 |
| 14 | 20 | 12 | 60 |
| 20 | 20 | 12 | 60 |
| 5 | 20 | 12 | 60 |
| 7 | 20 | 12 | 60 |
| 18 | 20 | 11 | 55 |
| 4 | 20 | 11 | 55 |
| 10 | 20 | 10 | 50 |
| 23 | 20 | 9 | 45 |
| 8 | 20 | 9 | 45 |
p_make_player <- make_by_player %>%
ggplot(aes(x = reorder(player_id, make_rate), y = make_rate)) +
geom_col(fill = "steelblue", alpha = 0.8) +
geom_hline(yintercept = overall_make_rate * 100, linetype = "dashed", color = "red") +
coord_flip() +
labs(title = "Make Rate by Player",
subtitle = paste("Red line = overall (", round(overall_make_rate * 100, 1), "%)"),
x = "Player ID", y = "Make Rate (%)") +
theme_minimal()
print(p_make_player)
p_jump_make <- df %>%
ggplot(aes(x = .data[[jump_col]], y = made)) +
geom_jitter(alpha = 0.4, height = 0.1, width = 0, color = "steelblue") +
geom_smooth(method = "loess", se = TRUE, color = "red", fill = "red", alpha = 0.2) +
labs(
title = paste("Make Probability vs", jump_col, "(Descriptive Only)"),
subtitle = "Red line = LOESS smooth",
x = jump_col,
y = "Made (0/1)"
) +
theme_minimal()
print(p_jump_make)
cat("## Summary\n\n")
# Completeness
cat("### Data Completeness\n\n")
high_missing <- missingness_by_col %>% filter(pct_missing > 5)
if (nrow(high_missing) > 0) {
cat("- **Columns with >5% missing:**", paste(high_missing$column, collapse = ", "), "\n")
} else {
cat("- ✓ No columns with >5% missing data\n")
}
incomplete <- trials_per_player %>% filter(n_trials != 20)
if (nrow(incomplete) > 0) {
cat("- **Players with incomplete trials:**", paste(incomplete$player_id, collapse = ", "), "\n")
} else {
cat("- ✓ All players have expected 20 trials\n")
}
# Artifact detection
cat("\n### Potential Artifact Variables\n\n")
artifact_flags <- numeric_summary %>%
mutate(p99_p1_ratio = ifelse(p1 != 0 & !is.na(p1), p99 / p1, NA)) %>%
filter(p99_p1_ratio > 100 | min < 0 | is.na(p99_p1_ratio))
if (nrow(artifact_flags) > 0) {
cat("- **Flagged variables:**", paste(artifact_flags$variable, collapse = ", "), "\n")
cat(" - (Extreme p99/p1 ratios or negative minimums)\n")
} else {
cat("- ✓ No obvious artifact variables detected\n")
}
# Stability
cat("\n### Variable Stability\n\n")
cv_summary <- numeric_summary %>%
mutate(cv = sd / abs(mean)) %>%
arrange(cv)
stable_vars <- cv_summary %>% filter(cv < 0.3) %>% pull(variable)
noisy_vars <- cv_summary %>% filter(cv > 0.5) %>% pull(variable)
cat("- **Most stable (CV < 0.3):**", paste(head(stable_vars, 5), collapse = ", "), "\n")
cat("- **Most variable (CV > 0.5):**", paste(head(noisy_vars, 5), collapse = ", "), "\n")
# Collinearity
cat("\n### Collinearity Patterns\n\n")
if (nrow(strong_cors) > 0) {
cat("- Found **", nrow(strong_cors), " variable pairs** with |r| > 0.7\n")
cat("- **Top correlations:**\n")
top_cors <- strong_cors %>% slice_head(n = 5)
for (i in 1:nrow(top_cors)) {
cat(" - ", top_cors$var1[i], " ↔ ", top_cors$var2[i],
" (r = ", top_cors$correlation[i], ")\n", sep = "")
}
} else {
cat("- No variable pairs with |r| > 0.7\n")
}
# Shot type differences
if (has_shot_type) {
cat("\n### Shot Type Observations\n\n")
cat("- Review violin/boxplots for systematic 2PT vs 3PT differences\n")
cat("- Jump height is typically higher for 3PT shots (greater distance)\n")
}
# Make rate
if (has_made) {
cat("\n### Make Rate Observations\n\n")
cat("- **Overall make rate:**", round(overall_make_rate * 100, 1), "%\n")
cat("- **Player variation:**", round(min(make_by_player$make_rate), 1), "% –",
round(max(make_by_player$make_rate), 1), "%\n")
}
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
##
## time zone: America/Los_Angeles
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gt_1.3.0 skimr_2.2.2 naniar_1.1.0 here_1.0.2
## [5] knitr_1.51 janitor_2.2.1 lubridate_1.9.5 forcats_1.0.1
## [9] stringr_1.6.0 dplyr_1.2.0 purrr_1.2.1 readr_2.1.6
## [13] tidyr_1.3.2 tibble_3.3.1 ggplot2_4.0.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.10 generics_0.1.4 xml2_1.5.2 lattice_0.22-9
## [5] stringi_1.8.7 hms_1.1.4 digest_0.6.39 magrittr_2.0.4
## [9] evaluate_1.0.5 grid_4.5.1 timechange_0.4.0 RColorBrewer_1.1-3
## [13] fastmap_1.2.0 Matrix_1.7-4 rprojroot_2.1.1 jsonlite_2.0.0
## [17] mgcv_1.9-4 scales_1.4.0 jquerylib_0.1.4 cli_3.6.5
## [21] crayon_1.5.3 rlang_1.1.7 splines_4.5.1 bit64_4.6.0-1
## [25] base64enc_0.1-6 withr_3.0.2 repr_1.1.7 cachem_1.1.0
## [29] yaml_2.3.12 otel_0.2.0 parallel_4.5.1 tools_4.5.1
## [33] tzdb_0.5.0 vctrs_0.7.1 R6_2.6.1 lifecycle_1.0.5
## [37] snakecase_0.11.1 bit_4.6.0 fs_1.6.6 vroom_1.7.0
## [41] pkgconfig_2.0.3 pillar_1.11.1 bslib_0.10.0 gtable_0.3.6
## [45] glue_1.8.0 visdat_0.6.0 xfun_0.56 tidyselect_1.2.1
## [49] farver_2.1.2 nlme_3.1-168 htmltools_0.5.9 labeling_0.4.3
## [53] rmarkdown_2.30 compiler_4.5.1 S7_0.2.1